Index:
Problem Statement:
XYZcart is India's largest fresh produce supply chain company. They are pioneers in solving one of the toughest supply chain problems in the world by leveraging innovative technology. An integral component of their automation process is the development of robust classifiers that can distinguish between images of different types of vegetables.
They have provided us with a dataset scraped from the web which contains train and test folders, each having 4 sub-folders with images of onions, potatoes, tomatoes and some market scenes. We have been tasked with preparing a multiclass classifier for identifying these vegetables.
import numpy as np
from numpy.linalg import norm
import pickle
from tqdm import tqdm, tqdm_notebook
import os
import seaborn as sns
import random
import time
import math
import tensorflow as tf
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import PIL
from PIL import Image
from sklearn.neighbors import NearestNeighbors
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
import glob
from tensorflow.keras import layers, regularizers
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline
import pandas as pd
from mpl_toolkits.axes_grid1 import ImageGrid
import sklearn.metrics as metrics
import os
import zipfile
import shutil
#plt.rcParams.update({'font.size': 14})
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
import logging
# Raise the TensorFlow logger's threshold so only ERROR (and above) messages are emitted.
tf.get_logger().setLevel(logging.ERROR)
Functions created for visualizing images
def img_reshape(imgpath):
    """Load an image file and return its pixels as an RGB NumPy array.

    Despite the name, the image is not resized or reshaped; it is only
    converted to RGB mode and turned into an array.

    Args:
        imgpath: Path of the image file to open.

    Returns:
        The array produced by ``np.asarray`` on the RGB-converted image.
    """
    # Open inside a context manager so the file handle is released
    # deterministically; ``convert`` returns a new in-memory image, so the
    # data stays valid after the file is closed.
    with Image.open(imgpath) as source:
        rgb = source.convert('RGB')
    return np.asarray(rgb)
def show_grid(img_arr, nrows=5, ncols=5):
    """Show images on an ``nrows`` x ``ncols`` grid, one image per cell.

    Args:
        img_arr: Iterable of images accepted by ``Axes.imshow``. Each item's
            ``size`` attribute is used as the title of its cell. Because the
            images are zipped with the grid axes, items beyond
            ``nrows * ncols`` are not drawn.
        nrows: Number of grid rows (previously ignored; the grid was always 5x5).
        ncols: Number of grid columns (previously ignored).
    """
    fig = plt.figure(figsize=(20., 20.))
    grid = ImageGrid(
        fig,
        111,
        nrows_ncols=(nrows, ncols),  # honour the caller's requested layout
        axes_pad=0.1,  # pad between axes
    )
    for ax, im in zip(grid, img_arr):
        ax.imshow(im)
        # NOTE(review): for a NumPy array ``size`` is the total element count,
        # not (height, width) -- confirm whether ``shape`` was intended.
        ax.set_title(im.size)
    plt.show()
Plot Train and Validation Accuracy
def annot_max(x, y, xytext=(0.94, 0.96), ax=None, only_y=True):
    """Annotate the maximum of a curve with a boxed label and an arrow.

    Args:
        x: Indexable sequence of x values, parallel to ``y``.
        y: Sequence of y values; its maximum is the annotated point.
        xytext: Position of the label, in axes-fraction coordinates.
        ax: Axes to annotate; when falsy, the current axes (``plt.gca()``)
            are used.
        only_y: If true the label shows only the y value as a percentage,
            otherwise both x and y.
    """
    peak_index = np.argmax(y)
    xmax, ymax = x[peak_index], max(y)

    label = f"{ymax:.2f}%" if only_y else f"x={xmax:.2f}, y={ymax:.2f}%"
    target_axes = ax if ax else plt.gca()

    # The arrow points at the peak in data coordinates; the label itself is
    # positioned relative to the axes.
    target_axes.annotate(
        label,
        xy=(xmax, ymax),
        xytext=xytext,
        xycoords='data',
        textcoords="axes fraction",
        arrowprops={"arrowstyle": "->", "connectionstyle": "angle,angleA=0,angleB=60"},
        bbox={"boxstyle": "square,pad=0.3", "fc": "w", "ec": "k", "lw": 0.72},
        ha="right",
        va="top",
    )
def plot_accuracy(model_fit):
    """Plot train and validation accuracy (in percent) per epoch.

    Args:
        model_fit: Object whose ``history`` mapping holds per-epoch
            ``'accuracy'`` and ``'val_accuracy'`` sequences. Each curve's
            maximum is marked via ``annot_max``.
    """
    history = model_fit.history
    epochs = range(0, len(history['accuracy']))

    # Convert both curves to percentages before anything is drawn.
    train_pct, val_pct = (
        [value * 100 for value in history[key]]
        for key in ('accuracy', 'val_accuracy')
    )

    curves = (
        (train_pct, 'Train', 'b', (0.7, 0.9)),
        (val_pct, 'Val', 'r', (0.8, 0.7)),
    )
    for series, label, colour, label_pos in curves:
        plt.plot(epochs, series, label=label, color=colour)
        annot_max(epochs, series, xytext=label_pos)

    plt.ylabel('Accuracy', fontsize=10)
    plt.xlabel('epoch', fontsize=10)
    plt.legend()
    plt.show()
Functions to plot Model Accuracy and Confusion matrix
def print_accuracy_stats(model, ds, chkpnt_path):
    """Load checkpoint weights into ``model`` and print its accuracy on ``ds``.

    Args:
        model: Model exposing ``load_weights`` and ``predict_on_batch``.
        ds: Iterable of ``(inputs, labels)`` batches. Labels are compared
            directly with the arg-max of the predictions, so they are
            presumably integer class ids -- confirm against the caller.
        chkpnt_path: Path of the weights checkpoint to load.
    """
    model.load_weights(chkpnt_path)

    # Walk the dataset exactly once, predicting each batch as it is read.
    # Collecting labels in one pass and calling ``model.predict(ds)`` in a
    # second pass pairs labels with the wrong predictions whenever the
    # dataset yields a different order on each iteration (e.g. shuffling).
    label_batches, prediction_batches = [], []
    for inputs, labels in ds:
        label_batches.append(labels)
        batch_scores = model.predict_on_batch(inputs)
        prediction_batches.append(tf.argmax(batch_scores, axis=1))

    true_categories = tf.concat(label_batches, axis=0)
    predicted_categories = tf.concat(prediction_batches, axis=0)

    test_acc = metrics.accuracy_score(true_categories, predicted_categories) * 100
    print(f'\nTest Accuracy: {test_acc:.2f}%\n')
def plot_confusion_matrix(model, ds, chkpnt_path, class_names):
    """Load checkpoint weights into ``model`` and draw its confusion matrix on ``ds``.

    Args:
        model: Model exposing ``load_weights`` and ``predict_on_batch``.
        ds: Iterable of ``(inputs, labels)`` batches. Labels are presumably
            integer class ids in ``range(len(class_names))`` -- confirm
            against the caller.
        chkpnt_path: Path of the weights checkpoint to load.
        class_names: Class labels used for the heatmap ticks, ordered by
            class id.
    """
    plt.figure(figsize=(4, 3))
    model.load_weights(chkpnt_path)

    # Walk the dataset exactly once so every label is paired with the
    # prediction for the same sample, even if the dataset reshuffles between
    # iterations.
    label_batches, prediction_batches = [], []
    for inputs, labels in ds:
        label_batches.append(labels)
        batch_scores = model.predict_on_batch(inputs)
        prediction_batches.append(tf.argmax(batch_scores, axis=1))

    true_categories = tf.concat(label_batches, axis=0)
    predicted_categories = tf.concat(prediction_batches, axis=0)

    # Fix the label order explicitly so the matrix always has one row/column
    # per entry of ``class_names``, even when a class is absent from ``ds``.
    cm = metrics.confusion_matrix(
        true_categories,
        predicted_categories,
        labels=list(range(len(class_names))),
    )
    sns.heatmap(cm, annot=True, xticklabels=class_names, yticklabels=class_names, cmap="YlGnBu", fmt='g')
    plt.show()
# Mount Google Drive at /content/drive again, this time passing
# force_remount=True to the mount call.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
Mounted at /content/drive
# Switch the working directory to the case-study folder on Drive.
# NOTE(review): Drive is mounted at /content/drive in the cells above, but
# this path starts with /content/gdrive -- confirm the mount point.
cd /content/gdrive/MyDrive/..../XYZCart_Casestudy
/content/gdrive/MyDrive/Scaler/NinjaCart_Casestudy
# Create the ProjectData folder under the current directory (no error if it
# already exists), then download the dataset archive into it with gdown.
!mkdir -p ProjectData
!gdown https://drive.google.com/uc?id=1clZX-lV_MLxKHSyeyTheX5OCQtNCUcqT --output /content/gdrive/MyDrive/..../XYZCart_Casestudy/ProjectData/
Downloading... From: https://drive.google.com/uc?id=1clZX-lV_MLxKHSyeyTheX5OCQtNCUcqT To: /content/gdrive/MyDrive/Scaler/NinjaCart_Casestudy/ProjectData/ninjacart_data.zip 100% 275M/275M [00:02<00:00, 132MB/s]
# Unpack the downloaded archive next to itself, then rename the shipped
# 'train' folder to 'trainAndVal' so a separate train/validation split can be
# created from it later.
# NOTE(review): Drive is mounted at /content/drive in the cells above, but
# these paths start with /content/gdrive -- confirm the mount point.
project_data_dir = "/content/gdrive/MyDrive/..../XYZCart_Casestudy/ProjectData"
dataset_dir = project_data_dir + "/XYZcart_data"

archive = zipfile.ZipFile(project_data_dir + "/XYZcart_data.zip", "r")
with archive as zip_ref:
    zip_ref.extractall(project_data_dir + "/")

# Same effect as `mkdir -p`: create the target folder, tolerating its existence.
os.makedirs(dataset_dir + "/trainAndVal", exist_ok=True)
os.rename(dataset_dir + "/train", dataset_dir + "/trainAndVal")
Alternatively, the archive can be extracted with the `!unzip` shell command.
Note: The train/test split is already provided. Let's create the validation set from the train data folder.
# Build a train/validation split by copying files out of the 'trainAndVal'
# folder: for every class, the last `val_ratio` share of a shuffled file list
# goes to 'validation', the rest to 'train'.
root_dir = '/content/drive/MyDrive/..../XYZCart_Casestudy/ProjectData/XYZcart_data'
totaltrain_dir = root_dir + '/trainAndVal'  # data root path
classes_dir = ['indian market', 'onion', 'potato', 'tomato']  # total labels
val_ratio = 0.15

# A fixed seed plus a sorted listing makes the split reproducible, so running
# this cell again copies the same files to the same place instead of mixing
# train and validation images.
# NOTE: if 'train'/'validation' already hold a split produced by an unseeded
# run, delete those folders first, otherwise the two splits will overlap.
split_rng = np.random.default_rng(42)

for cls in classes_dir:
    # exist_ok=True lets the cell be re-run without crashing on existing folders.
    os.makedirs(os.path.join(root_dir, 'validation', cls), exist_ok=True)
    os.makedirs(os.path.join(root_dir, 'train', cls), exist_ok=True)

for cls in classes_dir:
    class_src_dir = os.path.join(totaltrain_dir, cls)
    # os.listdir returns entries in arbitrary order; sort before shuffling.
    allFileNames = sorted(os.listdir(class_src_dir))
    split_rng.shuffle(allFileNames)

    split_at = int(len(allFileNames) * (1 - val_ratio))
    train_FileNames = allFileNames[:split_at]
    val_FileNames = allFileNames[split_at:]

    for name in train_FileNames:
        shutil.copy(os.path.join(class_src_dir, name), os.path.join(root_dir, 'train', cls))
    for name in val_FileNames:
        shutil.copy(os.path.join(class_src_dir, name), os.path.join(root_dir, 'validation', cls))
# Tabulate how many image files each class has in the train, validation and
# test folders: one row per data set, one column per class.
imagesCountDataframe = pd.DataFrame(
    columns=['DataSet', 'indian market', 'onion', 'potato', 'tomato']
)
imagesCountDataframe['DataSet'] = ['train', 'validation', 'test']

for cls in classes_dir:
    # List all three folders up front, before anything is printed or stored.
    TrainFileNames, ValFileNames, TestFileNames = (
        os.listdir(root_dir + '/' + folder + '/' + cls)
        for folder in ('train', 'validation', 'test')
    )

    print(f"--------Stats of class name:{cls}------------------")
    report_rows = (
        ('train', "Train files count:", TrainFileNames),
        ('validation', "validation files count:", ValFileNames),
        ('test', "test files count:", TestFileNames),
    )
    for dataset_name, caption, file_names in report_rows:
        file_count = len(file_names)
        print(caption + str(file_count))
        row_mask = imagesCountDataframe['DataSet'] == dataset_name
        imagesCountDataframe.loc[row_mask, cls] = file_count
--------Stats of class name:indian market------------------ Train files count:509 validation files count:90 test files count:81 --------Stats of class name:onion------------------ Train files count:721 validation files count:128 test files count:83 --------Stats of class name:potato------------------ Train files count:763 validation files count:135 test files count:81 --------Stats of class name:tomato------------------ Train files count:670 validation files count:119 test files count:106
imagesCountDataframe
| DataSet | indian market | onion | potato | tomato | |
|---|---|---|---|---|---|
| 0 | train | 509 | 721 | 763 | 670 |
| 1 | validation | 90 | 128 | 135 | 119 |
| 2 | test | 81 | 83 | 81 | 106 |
Plotting bar plots of the per-class image counts in each data set
# Draw one horizontal bar plot per class on a 2x2 grid, showing the image
# count of that class in each data set (train / validation / test).
fig, ax = plt.subplots(2, 2, figsize=(15, 7))
sns.set(font_scale=0.85)

# ax.flat walks the grid row by row, so the classes land on the same subplots
# as before: [0][0], [0][1], [1][0], [1][1].
class_columns = ['indian market', 'onion', 'potato', 'tomato']
for class_axis, class_column in zip(ax.flat, class_columns):
    sns.barplot(
        data=imagesCountDataframe,
        y="DataSet",
        x=class_column,
        errorbar="sd",
        width=0.2,
        ax=class_axis,
    )

# Render explicitly so the cell does not echo the last Axes object's repr.
plt.show()
<Axes: xlabel='tomato', ylabel='DataSet'>
Plotting an image grid for each class in the train set to check for irregular image dimensions
# Show a 5x5 grid of train images for every class.
for cls in classes_dir:
    class_dir = root_dir + '/train/' + cls
    TrainFileNames = os.listdir(class_dir)
    print("--------------- Train set : class is '"+cls+"'----------------")
    # Only 25 images fit on the 5x5 grid, so decode just those instead of
    # loading every file of the class into memory. The loop variable is not
    # called `image`, which would shadow the imported Keras `image` module.
    img_arr = [
        img_reshape(class_dir + '/' + file_name)
        for file_name in TrainFileNames[:25]
    ]
    show_grid(img_arr, 5, 5)
--------------- Train set : class is 'indian market'----------------
--------------- Train set : class is 'onion'----------------
--------------- Train set : class is 'potato'----------------
--------------- Train set : class is 'tomato'----------------
Plotting an image grid for each class in the validation set to check for irregular image dimensions
# Show a 5x5 grid of validation images for every class.
for cls in classes_dir:
    class_dir = root_dir + '/validation/' + cls
    # Held in ValFileNames (not TrainFileNames) because these are validation files.
    ValFileNames = os.listdir(class_dir)
    print("--------------- validation set : class is '"+cls+"'----------------")
    # Only 25 images fit on the default 5x5 grid, so decode just those instead
    # of loading every file of the class into memory. The loop variable is not
    # called `image`, which would shadow the imported Keras `image` module.
    img_arr = [
        img_reshape(class_dir + '/' + file_name)
        for file_name in ValFileNames[:25]
    ]
    show_grid(img_arr)
--------------- validation set : class is 'indian market'----------------
--------------- validation set : class is 'onion'----------------
--------------- validation set : class is 'potato'----------------
--------------- validation set : class is 'tomato'----------------
Plotting an image grid for each class in the test set to check for irregular image dimensions
# Show a 5x5 grid of test images for every class.
for cls in classes_dir:
    class_dir = root_dir + '/test/' + cls
    # Held in TestFileNames (not TrainFileNames) because these are test files.
    TestFileNames = os.listdir(class_dir)
    print("--------------- test set : class is '"+cls+"'----------------")
    # Only 25 images fit on the default 5x5 grid, so decode just those instead
    # of loading every file of the class into memory. The loop variable is not
    # called `image`, which would shadow the imported Keras `image` module.
    img_arr = [
        img_reshape(class_dir + '/' + file_name)
        for file_name in TestFileNames[:25]
    ]
    show_grid(img_arr)
--------------- test set : class is 'indian market'----------------
--------------- test set : class is 'onion'----------------
--------------- test set : class is 'potato'----------------